#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A))
! (C) Copyright 1995- ECMWF.
! (C) Copyright 1995- Meteo-France.
! (C) Copyright 2022- NVIDIA.
!
! This software is licensed under the terms of the Apache Licence Version 2.0
! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
! In applying this licence, ECMWF does not waive the privileges and immunities
! granted to it by virtue of its status as an intergovernmental organisation
! nor does it submit to any jurisdiction.
!

MODULE TRLTOG_MOD
  USE BUFFERED_ALLOCATOR_MOD, ONLY: ALLOCATION_RESERVATION_HANDLE
  IMPLICIT NONE

  PRIVATE
  PUBLIC :: TRLTOG, TRLTOG_HANDLE, PREPARE_TRLTOG

  TYPE TRLTOG_HANDLE
    TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFR_AND_COMBUFS
  END TYPE
CONTAINS
  FUNCTION PREPARE_TRLTOG(ALLOCATOR,KF_FS,KF_GP) RESULT(HTRLTOG)
    USE PARKIND_ECTRANS,        ONLY: JPIM, JPRBT
    USE TPM_DISTR,              ONLY: D
    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE
    USE ISO_C_BINDING,          ONLY: C_SIZE_T

    IMPLICIT NONE

    TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR
    INTEGER(KIND=JPIM), INTENT(IN) :: KF_GP, KF_FS
    TYPE(TRLTOG_HANDLE) :: HTRLTOG

    REAL(KIND=JPRBT) :: DUMMY

    INTEGER(KIND=C_SIZE_T) :: NELEM

    NELEM = ALIGN(KF_GP*D%NGPTOT*SIZEOF(DUMMY),128) ! ZCOMBUFR
    NELEM = ALIGN(NELEM + KF_FS*D%NLENGTF*SIZEOF(DUMMY),128) !ZCOMBUFS upper obund

    HTRLTOG%HCOMBUFR_AND_COMBUFS = RESERVE(ALLOCATOR, NELEM)
  END FUNCTION PREPARE_TRLTOG

  SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KPTRGP,&
     & KVSETUV,KVSETSC,KVSETSC3A,KVSETSC3B,KVSETSC2,&
     & PGP,PGPUV,PGP3A,PGP3B,PGP2)

    !**** *trltog * - transposition of grid point data from latitudinal
    !   to column structure. This takes place between inverse
    !                 FFT and grid point calculations.
    !                 TRLTOG is the inverse of TRGTOL

    ! Version using CUDA-aware MPI

    !     Purpose.
    !     --------


    !**   Interface.
    !     ----------
    !        *call* *trltog(...)

    !        Explicit arguments :
    !        --------------------
    !           PREEL_REAL    -  Latitudinal data ready for direct FFT (input)
    !           PGP    -  Blocked grid point data    (output)
    !           KVSET    - "v-set" for each field      (input)

    !        Implicit arguments :
    !        --------------------

    !     Method.
    !     -------
    !        See documentation

    !     Externals.
    !     ----------

    !     Reference.
    !     ----------
    !        ECMWF Research Department documentation of the IFS

    !     Author.
    !     -------
    !        MPP Group *ECMWF*

    !     Modifications.
    !     --------------
    !        Original  : 95-10-01
    !        D.Dent    : 97-08-04 Reorganisation to allow NPRTRV
    !                             to differ from NPRGPEW
    !        =99-03-29= Mats Hamrud and Deborah Salmond
    !                   JUMP in FFT's changed to 1
    !                   INDEX introduced and ZCOMBUF not used for same PE
    !         01-11-23  Deborah Salmond and John Hague
    !                   LIMP_NOOLAP Option for non-overlapping message passing
    !                               and buffer packing
    !         01-12-18  Peter Towers
    !                   Improved vector performance of LTOG_PACK,LTOG_UNPACK
    !         03-0-02   G. Radnoti: Call barrier always when nproc>1
    !         08-01-01  G.Mozdzynski: cleanup
    !         09-01-02  G.Mozdzynski: use non-blocking recv and send
    !     ------------------------------------------------------------------

    USE PARKIND_ECTRANS,        ONLY: JPIM, JPRB, JPRBT
    USE YOMHOOK,                ONLY: LHOOK, DR_HOOK, JPHOOK
    USE MPL_MODULE,             ONLY: MPL_WAIT, MPL_BARRIER
    USE TPM_GEN,                ONLY: LSYNC_TRANS, NERR
    USE EQ_REGIONS_MOD,         ONLY: MY_REGION_EW, MY_REGION_NS
    USE TPM_DISTR,              ONLY: D,MYSETV, MYSETW, MTAGLG,NPRCIDS,MYPROC,NPROC,NPRTRW,NPRTRV
    USE PE2SET_MOD,             ONLY: PE2SET
    USE MPL_DATA_MODULE,        ONLY: MPL_COMM_OML
    USE OML_MOD,                ONLY: OML_MY_THREAD
    USE ABORT_TRANS_MOD,        ONLY: ABORT_TRANS
#if ECTRANS_HAVE_MPI
    USE MPI_F08,                ONLY: MPI_COMM, MPI_REQUEST, MPI_FLOAT, MPI_DOUBLE
    ! Missing: MPI_ISEND, MPI_IRECV on purpose due to cray-mpi bug (see https://github.com/ecmwf-ifs/ectrans/pull/157)
#endif
    USE TPM_STATS,              ONLY: GSTATS => GSTATS_NVTX
    USE TPM_TRANS,              ONLY: LDIVGP, LSCDERS, LUVDER, LVORGP, NPROMA
    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION
    USE ISO_C_BINDING,          ONLY: C_SIZE_T
    USE OPENACC_EXT,            ONLY: EXT_ACC_ARR_DESC, EXT_ACC_PASS, EXT_ACC_CREATE, &
      &                               EXT_ACC_DELETE
    USE OPENACC,                ONLY: ACC_HANDLE_KIND

    IMPLICIT NONE

#ifdef OMPGPU
  include 'mpif.h'
#endif


    REAL(KIND=JPRBT),  INTENT(INOUT), POINTER  :: PREEL_REAL(:)
    INTEGER(KIND=JPIM),INTENT(IN)  :: KF_FS,KF_GP
    INTEGER(KIND=JPIM),INTENT(IN)  :: KF_UV_G, KF_SCALARS_G
    INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:)
    REAL(KIND=JPRB),OPTIONAL,INTENT(OUT)     :: PGP(:,:,:)
    REAL(KIND=JPRB),OPTIONAL,INTENT(OUT)     :: PGPUV(:,:,:,:)
    REAL(KIND=JPRB),OPTIONAL,INTENT(OUT)     :: PGP3A(:,:,:,:)
    REAL(KIND=JPRB),OPTIONAL,INTENT(OUT)     :: PGP3B(:,:,:,:)
    REAL(KIND=JPRB),OPTIONAL,INTENT(OUT)     :: PGP2(:,:,:)
    INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:)
    INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:)
    INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:)
    INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:)
    INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:)

    TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR
    TYPE(TRLTOG_HANDLE) :: HTRLTOG

    ! LOCAL VARIABLES

    REAL(KIND=JPRBT), POINTER :: ZCOMBUFS(:),ZCOMBUFR(:)

    INTEGER(KIND=JPIM) :: ISENDTOT (NPROC)
    INTEGER(KIND=JPIM) :: IRECVTOT (NPROC)
    INTEGER(KIND=JPIM) :: IREQ     (NPROC*2)
    INTEGER(KIND=JPIM) :: IRECV_TO_PROC(NPROC)
    INTEGER(KIND=JPIM) :: ISEND_TO_PROC(NPROC)

    INTEGER(KIND=JPIM) :: JFLD, J, JI, J1, J2, JGL, JK, JL, IFLDS, JROC, INR, INS
    INTEGER(KIND=JPIM) :: IFIRSTLAT, ILASTLAT, IFLD, IGL, IGLL,&
                 &IPOS, ISETA, ISETB, ISETV, ISEND, IRECV, ISETW, IPROC, &
                 &IR, ILOCAL_LAT, ISEND_COUNTS, IRECV_COUNTS, IERROR, II, ILEN, IBUFLENS, IBUFLENR, &
                 &JBLK, ILAT_STRIP

    ! Contains FIELD, PARS, LEVS
    INTEGER(KIND=JPIM) :: IGP_OFFSETS(KF_GP,3)
    INTEGER(KIND=JPIM), PARAMETER :: IGP_OFFSETS_UV=1, IGP_OFFSETS_GP2=2, IGP_OFFSETS_GP3A=3, IGP_OFFSETS_GP3B=4
    INTEGER(KIND=JPIM) :: IUVPAR,IGP2PAR,IGP3ALEV,IGP3APAR,IGP3BLEV,IGP3BPAR,IPAROFF,IOFF

    INTEGER(KIND=JPIM) :: IFLDA(KF_GP)
    INTEGER(KIND=JPIM) :: IIN_TO_SEND_BUFR(D%NLENGTF,2),IIN_TO_SEND_BUFR_OFFSET(NPROC), IIN_TO_SEND_BUFR_V
    INTEGER(KIND=JPIM) :: IRECV_FIELD_COUNT(NPRTRV),IRECV_FIELD_COUNT_V
    INTEGER(KIND=JPIM) :: IRECV_WSET_SIZE(NPRTRW),IRECV_WSET_SIZE_V
    INTEGER(KIND=JPIM) :: IRECV_WSET_OFFSET(NPRTRW+1), IRECV_WSET_OFFSET_V
    INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:)
    INTEGER(KIND=JPIM) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V

    INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G)
    INTEGER(KIND=JPIM) :: IVSETSC(KF_SCALARS_G)
    INTEGER(KIND=JPIM) :: IVSET(KF_GP)
    INTEGER(KIND=JPIM) :: J3,IFGP2,IFGP3A,IFGP3B

    REAL(KIND=JPHOOK) :: ZHOOK_HANDLE
    REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR

    TYPE(EXT_ACC_ARR_DESC) :: ACC_POINTERS(5) ! at most 5 copyins...
    INTEGER(KIND=JPIM) :: ACC_POINTERS_CNT = 0

#if ECTRANS_HAVE_MPI
    TYPE(MPI_COMM) :: LOCAL_COMM
    TYPE(MPI_REQUEST) :: IREQUEST(NPROC*2)
#endif

#ifdef PARKINDTRANS_SINGLE
#define TRLTOG_DTYPE MPI_FLOAT
#else
#define TRLTOG_DTYPE MPI_DOUBLE
#endif
#if ECTRANS_HAVE_MPI
    LOCAL_COMM%MPI_VAL = MPL_COMM_OML( OML_MY_THREAD() )
#endif
    !     ------------------------------------------------------------------

    !*       0.    Some initializations
    !              --------------------
    IF (LHOOK) CALL DR_HOOK('TRLTOG',0,ZHOOK_HANDLE)

    ! Note we have either
    ! - KVSETUV and KVSETSC (with PGP, which has u, v, and scalar fields), or
    ! - KVSETUV, KVSETSC2, KVSETSC3A KVSETSC3B (with PGPUV, GP3A, PGP3B and PGP2)
    ! KVSETs are optionals. Their sizes canalso be inferred from KV_UV_G/KV_SCALARS_G (which
    ! should match PSPXXX and PGPXXX arrays)


    ! We first get the decomposition individually
    IVSETUV(:) = -1
    IF (PRESENT(KVSETUV)) IVSETUV(:) = KVSETUV(:)
    IVSETSC(:)=-1
    IF (PRESENT(KVSETSC)) THEN
      IVSETSC(:) = KVSETSC(:)
    ELSE
      IOFF=0
      IF (PRESENT(KVSETSC2)) THEN
        IVSETSC(IOFF+1:IOFF+SIZE(KVSETSC2))=KVSETSC2(:)
        IOFF = IOFF+SIZE(KVSETSC2)
      ENDIF
      IF (PRESENT(KVSETSC3A)) THEN
        DO J3=1,MERGE(UBOUND(PGP3A,3),UBOUND(PGP3A,3)/3,.NOT. LSCDERS)
          IVSETSC(IOFF+1:IOFF+SIZE(KVSETSC3A))=KVSETSC3A(:)
          IOFF=IOFF+SIZE(KVSETSC3A)
        ENDDO
      ENDIF
      IF (PRESENT(KVSETSC3B)) THEN
        ! If SCDERS is on, the size of PGP is 3X larger because it is
        ! holding various derivatives. The problem is that those are
        ! at different non-contiguous positions, hence we treat them
        ! as separate fields
        DO J3=1,MERGE(UBOUND(PGP3B,3),UBOUND(PGP3B,3)/3,.NOT. LSCDERS)
          IVSETSC(IOFF+1:IOFF+SIZE(KVSETSC3B))=KVSETSC3B(:)
          IOFF=IOFF+SIZE(KVSETSC3B)
        ENDDO
      ENDIF
      IF (IOFF > 0 .AND. IOFF /= KF_SCALARS_G ) THEN
        CALL ABORT_TRANS("TRLTOG: Error in IVSETSC computation")
      ENDIF
    ENDIF

    ! Now from UV and Scalars decomposition we get the full decomposition
    IOFF=0
    IF (KF_UV_G > 0) THEN
      IF (LVORGP) THEN
        IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:)
        IOFF=IOFF+KF_UV_G
      ENDIF
      IF ( LDIVGP) THEN
        IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:)
        IOFF=IOFF+KF_UV_G
      ENDIF
      IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:)
      IOFF=IOFF+KF_UV_G
      IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:)
      IOFF=IOFF+KF_UV_G
    ENDIF
    IF (KF_SCALARS_G > 0) THEN
      IVSET(IOFF+1:IOFF+KF_SCALARS_G) = IVSETSC(:)
      IOFF=IOFF+KF_SCALARS_G
      IF (LSCDERS) THEN
        IVSET(IOFF+1:IOFF+KF_SCALARS_G) = IVSETSC(:)
        IOFF=IOFF+KF_SCALARS_G
      ENDIF
    ENDIF
    IF (KF_UV_G > 0 .AND. LUVDER) THEN
      IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:)
      IOFF=IOFF+KF_UV_G
      IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:)
      IOFF=IOFF+KF_UV_G
    ENDIF
    IF (KF_SCALARS_G > 0) THEN
      IF (LSCDERS) THEN
        IVSET(IOFF+1:IOFF+KF_SCALARS_G) = IVSETSC(:)
        IOFF=IOFF+KF_SCALARS_G
      ENDIF
    ENDIF

    IF (.NOT. PRESENT(PGP)) THEN
      ! This is only relevant if we use the split interface (i.e. not PGP)

      IGP2PAR = 0
      IGP3APAR = 0
      IGP3ALEV = 0
      IGP3BPAR = 0
      IGP3BLEV = 0
      IF (PRESENT(PGP2)) THEN
        IGP2PAR = UBOUND(PGP2,2)
        IF(LSCDERS) IGP2PAR = IGP2PAR/3
      ENDIF
      IF (PRESENT(PGP3A)) THEN
        IGP3ALEV = UBOUND(PGP3A,2)
        IGP3APAR = UBOUND(PGP3A,3)
        IF(LSCDERS) IGP3APAR = IGP3APAR/3
      ENDIF
      IF (PRESENT(PGP3B)) THEN
        IGP3BLEV = UBOUND(PGP3B,2)
        IGP3BPAR = UBOUND(PGP3B,3)
        IF(LSCDERS) IGP3BPAR = IGP3BPAR/3
      ENDIF
      IF (IGP2PAR + IGP3ALEV*IGP3APAR + IGP3BPAR*IGP3BLEV /= KF_SCALARS_G) THEN
        WRITE(NERR,*) IGP2PAR, IGP3APAR, IGP3ALEV, IGP3BPAR, IGP3BLEV
        CALL ABORT_TRANS("INCONSISTENCY IN SCALARS")
      ENDIF

      ! This is only relevant if we use the split interface (i.e. not PGP)
      IUVPAR = 1
      IOFF=1
      IF(LVORGP) THEN
        IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV
        IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR
        IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/)
        IUVPAR=IUVPAR+1
        IOFF=IOFF+KF_UV_G
      ENDIF

      IF(LDIVGP) THEN
        IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV
        IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR
        IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/)
        IUVPAR=IUVPAR+1
        IOFF=IOFF+KF_UV_G
      ENDIF

      ! U
      IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV
      IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR
      IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/)
      IUVPAR=IUVPAR+1
      IOFF=IOFF+KF_UV_G

      ! V
      IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV
      IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR
      IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/)
      IUVPAR=IUVPAR+1
      IOFF=IOFF+KF_UV_G

      ! Scalars
      ! PGP2
      IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,1) = IGP_OFFSETS_GP2
      IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,2) = (/(J, J=1,IGP2PAR)/)
      IOFF=IOFF+IGP2PAR
      ! PGP3A
      IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,1) = IGP_OFFSETS_GP3A
      IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,2) = (/(1+J/IGP3ALEV, J=0,IGP3APAR*IGP3ALEV-1)/)
      IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,3) = (/(1+MOD(J,IGP3ALEV), J=0,IGP3APAR*IGP3ALEV-1)/)
      IOFF=IOFF+IGP3APAR*IGP3ALEV
      ! PGP3B
      IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,1) = IGP_OFFSETS_GP3B
      IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,2) = (/(1+J/IGP3BLEV, J=0,IGP3BPAR*IGP3BLEV-1)/)
      IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,3) = (/(1+MOD(J,IGP3BLEV), J=0,IGP3BPAR*IGP3BLEV-1)/)
      IOFF=IOFF+IGP3BPAR*IGP3BLEV

      IF(LSCDERS) THEN
        !Scalars NS Derivatives
        ! PGP2
        IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,1) = IGP_OFFSETS_GP2
        IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,2) = (/(J+IGP2PAR, J=1,IGP2PAR)/)
        IOFF=IOFF+IGP2PAR
        ! PGP3A
        IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,1) = IGP_OFFSETS_GP3A
        IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,2) = (/(1+IGP3APAR+J/IGP3ALEV, J=0,IGP3APAR*IGP3ALEV-1)/)
        IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,3) = (/(1+MOD(J,IGP3ALEV), J=0,IGP3APAR*IGP3ALEV-1)/)
        IOFF=IOFF+IGP3APAR*IGP3ALEV
        ! PGP3B
        IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,1) = IGP_OFFSETS_GP3B
        IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,2) = (/(1+IGP3BPAR+J/IGP3BLEV, J=0,IGP3BPAR*IGP3BLEV-1)/)
        IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,3) = (/(1+MOD(J,IGP3BLEV), J=0,IGP3BPAR*IGP3BLEV-1)/)
        IOFF=IOFF+IGP3BPAR*IGP3BLEV
      ENDIF

      IF(LUVDER) THEN
        ! U Derivative NS
        IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV
        IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR
        IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/)
        IUVPAR=IUVPAR+1
        IOFF=IOFF+KF_UV_G

        ! V Derivative NS
        IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV
        IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR
        IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/)
        IUVPAR=IUVPAR+1
        IOFF=IOFF+KF_UV_G
      ENDIF

      IF(LSCDERS) THEN
        !Scalars NS Derivatives
        ! PGP2
        IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,1) = IGP_OFFSETS_GP2
        IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,2) = (/(J+2*IGP2PAR, J=1,IGP2PAR)/)
        IOFF=IOFF+IGP2PAR
        ! PGP3A
        IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,1) = IGP_OFFSETS_GP3A
        IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,2) = (/(1+2*IGP3APAR+J/IGP3ALEV, J=0,IGP3APAR*IGP3ALEV-1)/)
        IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,3) = (/(1+MOD(J,IGP3ALEV), J=0,IGP3APAR*IGP3ALEV-1)/)
        IOFF=IOFF+IGP3APAR*IGP3ALEV
        ! PGP3B
        IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,1) = IGP_OFFSETS_GP3B
        IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,2) = (/(1+2*IGP3BPAR+J/IGP3BLEV, J=0,IGP3BPAR*IGP3BLEV-1)/)
        IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,3) = (/(1+MOD(J,IGP3BLEV), J=0,IGP3BPAR*IGP3BLEV-1)/)
        IOFF=IOFF+IGP3BPAR*IGP3BLEV
      ENDIF
    ENDIF

    CALL GSTATS(1806,0)

    ! Prepare receiver arrays
    ! find number of fields on a certain V-set
    IF(NPRTRV == 1) THEN
      ! This is needed because KVSET(JFLD) == -1 if there is only one V-set
      IRECV_FIELD_COUNT(1) = KF_GP
    ELSE
      IRECV_FIELD_COUNT(:) = 0
      DO JFLD=1,KF_GP
        IRECV_FIELD_COUNT(IVSET(JFLD)) = IRECV_FIELD_COUNT(IVSET(JFLD)) + 1
      ENDDO
    ENDIF
    ! find number of grid-points on a certain W-set that overlap with myself
    IRECV_WSET_SIZE(:) = 0
    DO ILOCAL_LAT=D%NFRSTLAT(MY_REGION_NS),D%NLSTLAT(MY_REGION_NS)
      ILAT_STRIP = ILOCAL_LAT-D%NFRSTLAT(MY_REGION_NS)+D%NPTRFLOFF+1
      IRECV_WSET_SIZE(D%NPROCL(ILOCAL_LAT)) = &
          & IRECV_WSET_SIZE(D%NPROCL(ILOCAL_LAT))+D%NONL(ILAT_STRIP,MY_REGION_EW)
    ENDDO
    ! sum up offsets
    IRECV_WSET_OFFSET(1) = 0
    DO JROC=1,NPRTRW
      IRECV_WSET_OFFSET(JROC+1)=IRECV_WSET_OFFSET(JROC)+IRECV_WSET_SIZE(JROC)
    ENDDO
    DO JROC=1,NPROC
      CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV)
      ! total recv size is # points per field * # fields
      IRECVTOT(JROC) = IRECV_WSET_SIZE(ISETW)*IRECV_FIELD_COUNT(ISETV)
    ENDDO

    ! Prepare sender arrays
    IIN_TO_SEND_BUFR_OFFSET(1) = 0
    DO JROC=1,NPROC
      ! Get new offset to my current KINDEX entry
      IF (JROC > 1 .AND. KF_FS > 0) THEN
        IIN_TO_SEND_BUFR_OFFSET(JROC) = IIN_TO_SEND_BUFR_OFFSET(JROC-1)+ISENDTOT(JROC-1)/KF_FS
      ELSEIF (JROC > 1) THEN
        IIN_TO_SEND_BUFR_OFFSET(JROC) = IIN_TO_SEND_BUFR_OFFSET(JROC-1)
      ENDIF

      CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV)

      ! MAX(Index of first fourier latitude for this W set, first latitude of a senders A set)
      ! i.e. we find the overlap between what we have on sender side (others A set) and the receiver
      ! (me, the W-set). Ideally those conincide, at least mostly.
      IFIRSTLAT = MAX(D%NPTRLS(MYSETW),D%NFRSTLAT(ISETA))
      ! MIN(Index of last fourier latitude for this W set, last latitude of a senders A set)
      ILASTLAT  = MIN(D%NPTRLS(MYSETW)+D%NULTPP(MYSETW)-1,D%NLSTLAT(ISETA))

      IPOS = 0
      DO JGL=IFIRSTLAT,ILASTLAT
        ! get from "actual" latitude to the latitude strip offset
        IGL  = JGL-D%NFRSTLAT(ISETA)+D%NPTRFRSTLAT(ISETA)
        ! get from "actual" latitude to the latitude offset
        IGLL = JGL-D%NPTRLS(MYSETW)+1
        DO JL=1,D%NONL(IGL,ISETB)
          IPOS = IPOS+1
          ! offset to first layer of this gridpoint
          IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_OFFSET(JROC)+IPOS,1) = &
              & KF_FS*D%NSTAGTF(IGLL)+(D%NSTA(IGL,ISETB)-1)+(JL-1)
          ! distance between two layers of this gridpoint
          IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_OFFSET(JROC)+IPOS,2) = &
              & D%NSTAGTF(IGLL+1)-D%NSTAGTF(IGLL)
        ENDDO
      ENDDO
      !we always receive the full fourier space
      ISENDTOT(JROC) = IPOS*KF_FS
    ENDDO

#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC DATA COPYIN(IGP_OFFSETS) ASYNC(1)
#endif

    ACC_POINTERS_CNT = 0
    IF (PRESENT(PGP)) THEN
      ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1
      ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP)
    ENDIF
    IF (PRESENT(PGPUV)) THEN
      ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1
      ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGPUV)
    ENDIF
    IF (PRESENT(PGP2)) THEN
      ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1
      ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP2)
    ENDIF
    IF (PRESENT(PGP3A)) THEN
      ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1
      ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3A)
    ENDIF
    IF (PRESENT(PGP3B)) THEN
      ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1
      ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3B)
    ENDIF
    IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND)
#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC DATA IF(PRESENT(PGP))   PRESENT(PGP) ASYNC(1)
    !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) ASYNC(1)
    !$ACC DATA IF(PRESENT(PGP2))  PRESENT(PGP2) ASYNC(1)
    !$ACC DATA IF(PRESENT(PGP3A)) PRESENT(PGP3A) ASYNC(1)
    !$ACC DATA IF(PRESENT(PGP3B)) PRESENT(PGP3B) ASYNC(1)

    ! Present until self contribution and packing are done
    !$ACC DATA COPYIN(IIN_TO_SEND_BUFR) PRESENT(PREEL_REAL) IF(KF_FS > 0) ASYNC(1)
#endif
#ifdef OMPGPU
#endif

    CALL GSTATS(1806,1)

    ! Copy local contribution
    IF(ISENDTOT(MYPROC) > 0) THEN
      ! I have to send something to myself...

      ! Input is KF_GP fields. We find the resulting KF_FS fields.
      IFLDS = 0
      DO JFLD=1,KF_GP
        IF(IVSET(JFLD) == MYSETV .OR. IVSET(JFLD) == -1) THEN
          IFLDS = IFLDS+1
          IF(PRESENT(KPTRGP)) THEN
            IFLDA(IFLDS) = KPTRGP(JFLD)
          ELSE
            IFLDA(IFLDS) = JFLD
          ENDIF
        ENDIF
      ENDDO

#ifdef OMPGPU
#endif
#ifdef ACCGPU
      !$ACC DATA COPYIN(IFLDA(1:IFLDS)) ASYNC(1)
#endif

      CALL GSTATS(1604,0)

      IRECV_WSET_OFFSET_V = IRECV_WSET_OFFSET(MYSETW)
      IRECV_WSET_SIZE_V = IRECV_WSET_SIZE(MYSETW)
      IIN_TO_SEND_BUFR_V = IIN_TO_SEND_BUFR_OFFSET(MYPROC)
      IF (PRESENT(PGP)) THEN
#ifdef OMPGPU
#endif
#ifdef ACCGPU
        !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS) &
        !$ACC&         FIRSTPRIVATE(KF_FS,IRECV_WSET_SIZE_V,IRECV_WSET_OFFSET_V, &
        !$ACC&         IIN_TO_SEND_BUFR_V,NPROMA) ASYNC(1)
#endif
        DO JFLD=1,KF_FS
          DO JL=1,IRECV_WSET_SIZE_V
            JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1
            JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1
            IFLD = IFLDA(JFLD)
            IPOS = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,1)+ &
                & (JFLD-1)*IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,2)+1
            PGP(JK,IFLD,JBLK) = PREEL_REAL(IPOS)
          ENDDO
        ENDDO
      ELSE
#ifdef OMPGPU
#endif
#ifdef ACCGPU
        !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS) &
        !$ACC&              FIRSTPRIVATE(KF_FS,IRECV_WSET_SIZE_V,IRECV_WSET_OFFSET_V, &
        !$ACC&              IIN_TO_SEND_BUFR_V,NPROMA) ASYNC(1)
#endif
        DO JFLD=1,KF_FS
          DO JL=1,IRECV_WSET_SIZE_V
            JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1
            JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1
            IFLD = IFLDA(JFLD)
            IPOS = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,1)+ &
                & (JFLD-1)*IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,2)+1
            IF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_UV) THEN
              PGPUV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS)
            ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP2) THEN
              PGP2(JK,IGP_OFFSETS(IFLD,2),JBLK)=PREEL_REAL(IPOS)
            ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3A) THEN
              PGP3A(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS)
            ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3B) THEN
              PGP3B(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS)
            ENDIF
          ENDDO
        ENDDO
      ENDIF
      CALL GSTATS(1604,1)

#ifdef OMPGPU
#endif
#ifdef ACCGPU
      !$ACC END DATA
#endif

    ENDIF

    ! Figure out processes that send or recv something
    ISEND_COUNTS   = 0
    IRECV_COUNTS   = 0
    DO JROC=1,NPROC
      IF( JROC /= MYPROC) THEN
        IF(IRECVTOT(JROC) > 0) THEN
          ! I have to recv something, so let me store that
          IRECV_COUNTS = IRECV_COUNTS + 1
          IRECV_TO_PROC(IRECV_COUNTS)=JROC
        ENDIF
        IF(ISENDTOT(JROC) > 0) THEN
          ! I have to send something, so let me store that
          ISEND_COUNTS = ISEND_COUNTS+1
          ISEND_TO_PROC(ISEND_COUNTS)=JROC
        ENDIF
      ENDIF
    ENDDO

    ALLOCATE(ICOMBUFS_OFFSET(ISEND_COUNTS+1))
    ICOMBUFS_OFFSET(1) = 0
    DO JROC=1,ISEND_COUNTS
      ICOMBUFS_OFFSET(JROC+1) = ICOMBUFS_OFFSET(JROC) + ISENDTOT(ISEND_TO_PROC(JROC))
    ENDDO
    ALLOCATE(ICOMBUFR_OFFSET(IRECV_COUNTS+1))
    ICOMBUFR_OFFSET(1) = 0
    DO JROC=1,IRECV_COUNTS
      ICOMBUFR_OFFSET(JROC+1) = ICOMBUFR_OFFSET(JROC) + IRECVTOT(IRECV_TO_PROC(JROC))
    ENDDO

    IF (IRECV_COUNTS > 0) THEN
      CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRLTOG%HCOMBUFR_AND_COMBUFS),&
          & 1_C_SIZE_T, int(ICOMBUFR_OFFSET(IRECV_COUNTS+1)*SIZEOF(ZCOMBUFR(1)),kind=c_size_t))
    ENDIF
    IF (ISEND_COUNTS > 0) THEN
      CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRLTOG%HCOMBUFR_AND_COMBUFS),&
          & int(ALIGN(KF_GP*D%NGPTOT*SIZEOF(ZCOMBUFR(1)),128)+1,kind=c_size_t), &
          & int(ICOMBUFS_OFFSET(ISEND_COUNTS+1)*SIZEOF(ZCOMBUFS(1)),kind=c_size_t))
    ENDIF

#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC DATA PRESENT(ZCOMBUFS) IF(ISEND_COUNTS > 0) ASYNC(1)
#endif
    CALL GSTATS(1605,0)
    DO INS=1,ISEND_COUNTS
      IPROC = ISEND_TO_PROC(INS)
      ILEN = ISENDTOT(IPROC)/KF_FS
      IIN_TO_SEND_BUFR_V = IIN_TO_SEND_BUFR_OFFSET(IPROC)
      ICOMBUFS_OFFSET_V = ICOMBUFS_OFFSET(INS)
#ifdef OMPGPU
#endif
#ifdef ACCGPU
      !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IPOS) FIRSTPRIVATE(KF_FS,ILEN,IIN_TO_SEND_BUFR_V, &
      !$ACC&              ICOMBUFS_OFFSET_V) COLLAPSE(2) ASYNC(1)
#endif
      DO JFLD=1,KF_FS
        DO JL=1,ILEN
          IPOS = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,1)+ &
              & (JFLD-1)*IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,2)+1
          ZCOMBUFS(ICOMBUFS_OFFSET_V+(JFLD-1)*ILEN+JL) = PREEL_REAL(IPOS)
        ENDDO
      ENDDO
    ENDDO
    CALL GSTATS(1605,1)
#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC END DATA ! ZCOMBUFS

    !$ACC END DATA ! PREEL_REAL

    !$ACC WAIT(1)
#endif

    CALL GSTATS(805,0)

    IF (LSYNC_TRANS) THEN
      CALL GSTATS(440,0)
      CALL MPL_BARRIER(CDSTRING='')
      CALL GSTATS(440,1)
    ENDIF
    CALL GSTATS(421,0)

    IR=0
    !...Receive loop.........................................................
#ifdef USE_GPU_AWARE_MPI
#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC HOST_DATA USE_DEVICE(ZCOMBUFS,ZCOMBUFR)
#endif
#else
    !! this is safe-but-slow fallback for running without GPU-aware MPI
    !$ACC UPDATE HOST(ZCOMBUFS) IF(ISEND_COUNTS > 0)
#endif
    DO INR=1,IRECV_COUNTS
      IR=IR+1
      IRECV=IRECV_TO_PROC(INR)
#if ECTRANS_HAVE_MPI
      CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)), &
        & IRECVTOT(IRECV), &
        & TRLTOG_DTYPE,NPRCIDS(IRECV)-1, &
        & MTAGLG, LOCAL_COMM, IREQUEST(IR), &
        & IERROR )
      IREQ(IR) = IREQUEST(IR)%MPI_VAL
#else
      CALL ABORT_TRANS("Should not be here: MPI is disabled")
#endif
    ENDDO

    !...Send loop.........................................................
    DO INS=1,ISEND_COUNTS
      IR=IR+1
      ISEND=ISEND_TO_PROC(INS)
#if ECTRANS_HAVE_MPI
      CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT(ISEND), &
        & TRLTOG_DTYPE, NPRCIDS(ISEND)-1,MTAGLG,LOCAL_COMM,IREQUEST(IR),IERROR)
      IREQ(IR) = IREQUEST(IR)%MPI_VAL
#else
        CALL ABORT_TRANS("Should not be here: MPI is disabled")
#endif
    ENDDO

    IF(IR > 0) THEN
      CALL MPL_WAIT(KREQUEST=IREQ(1:IR), &
      & CDSTRING='TRLTOG: WAIT FOR SENDS AND RECEIVES')
    ENDIF

#ifdef USE_GPU_AWARE_MPI
#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC END HOST_DATA
#endif
#else
    !! this is safe-but-slow fallback for running without GPU-aware MPI
    !$ACC UPDATE DEVICE(ZCOMBUFR) IF(IRECV_COUNTS > 0)
#endif

    IF (LSYNC_TRANS) THEN
      CALL GSTATS(441,0)
      CALL MPL_BARRIER(CDSTRING='')
      CALL GSTATS(441,1)
    ENDIF
    CALL GSTATS(421,1)

#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC DATA PRESENT(ZCOMBUFR) IF(IRECV_COUNTS > 0) ASYNC(1)
#endif
    CALL GSTATS(805,1)

    !  Unpack loop.........................................................

    CALL GSTATS(1606,0)
    DO INR=1,IRECV_COUNTS
      IRECV=IRECV_TO_PROC(INR)
      CALL PE2SET(IRECV,ISETA,ISETB,ISETW,ISETV)

      IRECV_FIELD_COUNT_V = IRECV_FIELD_COUNT(ISETV)
      ICOMBUFR_OFFSET_V = ICOMBUFR_OFFSET(INR)

      IFLDS = 0
      DO JFLD=1,KF_GP
        IF(IVSET(JFLD) == ISETV .OR. IVSET(JFLD) == -1 ) THEN
          IFLDS = IFLDS+1
          IF(PRESENT(KPTRGP)) THEN
            IFLDA(IFLDS)=KPTRGP(JFLD)
          ELSE
            IFLDA(IFLDS)=JFLD
          ENDIF
        ENDIF
      ENDDO

#ifdef OMPGPU
#endif
#ifdef ACCGPU
      !$ACC DATA COPYIN(IFLDA(1:IRECV_FIELD_COUNT_V)) ASYNC(1)
#endif

      IRECV_WSET_OFFSET_V = IRECV_WSET_OFFSET(ISETW)
      IRECV_WSET_SIZE_V = IRECV_WSET_SIZE(ISETW)
      IF (PRESENT(PGP)) THEN
#ifdef OMPGPU
#endif
#ifdef ACCGPU
        !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) &
        !$ACC&              FIRSTPRIVATE(IRECV_FIELD_COUNT_V,IRECV_WSET_SIZE_V,&
        !$ACC&              IRECV_WSET_OFFSET_V,NPROMA,ICOMBUFR_OFFSET_V) ASYNC(1)
#endif
        DO JFLD=1,IRECV_FIELD_COUNT_V
          DO JL=1,IRECV_WSET_SIZE_V
            JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1
            JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1
            IFLD=IFLDA(JFLD)
            JI = ICOMBUFR_OFFSET_V+(JFLD-1)*IRECV_WSET_SIZE_V+JL
            PGP(JK,IFLD,JBLK) = ZCOMBUFR(JI)
          ENDDO
        ENDDO
      ELSE
#ifdef OMPGPU
#endif
#ifdef ACCGPU
        !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) &
        !$ACC&              FIRSTPRIVATE(IRECV_FIELD_COUNT_V,IRECV_WSET_SIZE_V, &
        !$ACC&              IRECV_WSET_OFFSET_V,NPROMA,ICOMBUFR_OFFSET_V) ASYNC(1)
#endif
        DO JFLD=1,IRECV_FIELD_COUNT_V
          DO JL=1,IRECV_WSET_SIZE_V
            JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1
            JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1
            IFLD=IFLDA(JFLD)
            JI = ICOMBUFR_OFFSET_V+(JFLD-1)*IRECV_WSET_SIZE_V+JL
            IF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_UV) THEN
              PGPUV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI)
            ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP2) THEN
              PGP2(JK,IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI)
            ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3A) THEN
              PGP3A(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI)
            ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3B) THEN
              PGP3B(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI)
            ENDIF
          ENDDO
        ENDDO
      ENDIF
#ifdef OMPGPU
#endif
#ifdef ACCGPU
      !$ACC END DATA
#endif
    ENDDO

#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC END DATA ! ZOMBUFR
#endif
    IF (LSYNC_TRANS) THEN
#ifdef ACCGPU
      !$ACC WAIT(1)
#endif
      CALL GSTATS(440,0)
      CALL MPL_BARRIER(CDSTRING='')
      CALL GSTATS(440,1)
    ENDIF
    CALL GSTATS(422,0)
#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC END DATA ! PGP3B
    !$ACC END DATA ! PGP3A
    !$ACC END DATA ! PGP2
    !$ACC END DATA ! PGPUV
    !$ACC END DATA ! PGP
#endif
    IF (PRESENT(PGP)) THEN
#ifdef OMPGPU
#endif
#ifdef ACCGPU
      !$ACC UPDATE HOST(PGP) ASYNC(1)
#endif
    ENDIF
    IF (PRESENT(PGPUV)) THEN
#ifdef OMPGPU
#endif
#ifdef ACCGPU
      !$ACC UPDATE HOST(PGPUV) ASYNC(1)
#endif
    ENDIF
    IF (PRESENT(PGP2)) THEN
#ifdef OMPGPU
#endif
#ifdef ACCGPU
      !$ACC UPDATE HOST(PGP2) ASYNC(1)
#endif
    ENDIF
    IF (PRESENT(PGP3A)) THEN
#ifdef OMPGPU
#endif
#ifdef ACCGPU
      !$ACC UPDATE HOST(PGP3A) ASYNC(1)
#endif
    ENDIF
    IF (PRESENT(PGP3B)) THEN
#ifdef OMPGPU
#endif
#ifdef ACCGPU
      !$ACC UPDATE HOST(PGP3B) ASYNC(1)
#endif
    ENDIF
    IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND)
    IF (LSYNC_TRANS) THEN
#ifdef ACCGPU
      !$ACC WAIT(1)
#endif
      CALL GSTATS(442,0)
      CALL MPL_BARRIER(CDSTRING='')
      CALL GSTATS(442,1)
    ENDIF
    CALL GSTATS(422,1)
#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC END DATA ! IRECVBUFR_TO_OUT,PGPINDICES

    !$ACC WAIT(1)
#endif

    CALL GSTATS(1606,1)

    IF (LHOOK) CALL DR_HOOK('TRLTOG',1,ZHOOK_HANDLE)
  END SUBROUTINE TRLTOG
END MODULE TRLTOG_MOD

